6 Microbial metagenomics

6.1 Microbial DNA fraction

6.1.1 Data overview

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(sample_type) %>%
    summarise(mean=mean(singlem_fraction, na.rm=T),
              sd=sd(singlem_fraction, na.rm=T),
              median=median(singlem_fraction, na.rm = TRUE),
              IQR=IQR(singlem_fraction, na.rm = TRUE)) %>% 
    tt()
tinytable_tf775bqva3a1jewbw7tn
sample_type mean sd median IQR
Anal/cloacal swab 0.2598510 0.3407707 0.0022 0.574700
Faecal 0.5203527 0.2759433 0.6270 0.350875
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(tax_group) %>%
    summarise(mean=mean(singlem_fraction, na.rm=T),
              sd=sd(singlem_fraction, na.rm=T),
              median=median(singlem_fraction, na.rm = TRUE),
              IQR=IQR(singlem_fraction, na.rm = TRUE)) %>% 
    tt()
tinytable_sveyeg44o64nnzq21hcn
tax_group mean sd median IQR
Amphibians 0.5807892 0.1799217 0.62900 0.128200
Bats 0.2903304 0.2949997 0.20650 0.523400
Birds 0.2777113 0.3360460 0.07850 0.556025
Mammals 0.5871579 0.2841097 0.67420 0.175250
Reptiles 0.5885336 0.1701545 0.63475 0.114050

6.1.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(singlem_fraction))%>%
    mutate(singlem_fraction=case_when(singlem_fraction>1~1,
                                      singlem_fraction<=1~singlem_fraction))%>%
    glm(singlem_fraction ~  tax_group + sample_type, data = .,family=quasibinomial)  %>%
    Anova(.,test.statistic = "F",type = "III") %>%
    tidy()%>%
    tt()
tinytable_67dtwep878yway81cx60
term sumsq df F.values p.value
tax_group 148.8863 4 119.1247 2.466079e-91
sample_type 51.7346 1 165.5725 1.784789e-36
Residuals 630.8543 2019 NA NA
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(singlem_fraction))%>%
    mutate(singlem_fraction=case_when(singlem_fraction>1~1,
                                      singlem_fraction<=1~singlem_fraction),
           sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(singlem_fraction ~  tax_group + sample_type, data = .,family=quasibinomial)  %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_nc4sam9strsvr4rgfgb6
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.25097357 0.10967066 -11.4066383 0.000000e+00
tax_group Birds - Amphibians 0 -0.88809563 0.11334382 -7.8354131 2.131628e-14
tax_group Mammals - Amphibians 0 0.16650972 0.10026122 1.6607589 4.507172e-01
tax_group Reptiles - Amphibians 0 0.03298124 0.10207377 0.3231118 9.975621e-01
tax_group Birds - Bats 0 0.36287794 0.09724522 3.7315759 1.884568e-03
tax_group Mammals - Bats 0 1.41748328 0.08113495 17.4706869 0.000000e+00
tax_group Reptiles - Bats 0 1.28395481 0.08304328 15.4612727 0.000000e+00
tax_group Mammals - Birds 0 1.05460535 0.08202966 12.8563901 0.000000e+00
tax_group Reptiles - Birds 0 0.92107687 0.08779908 10.4907351 0.000000e+00
tax_group Reptiles - Mammals 0 -0.13352848 0.07012718 -1.9040901 3.082301e-01

6.1.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    mutate(host_percentage= host_bases/bases_post_fastp*100)  %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=singlem_fraction, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0, 1) +
        geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type) +
        labs(y="Host percentage", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/microbialdata_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    mutate(host_percentage= host_bases/bases_post_fastp*100)  %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=singlem_fraction, x=sample_type, group=sample_type)) +
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Host percentage", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/microbialdata_taxa_all.pdf",width=9, height=4, units="in")

6.2 Domain-adjusted mapping rate

6.2.1 Data summary

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(sample_type) %>%
    summarise(mean=mean(damr, na.rm=T),
              sd=sd(damr, na.rm=T),
              median=median(damr, na.rm = TRUE),
              IQR=IQR(damr, na.rm = TRUE)) %>% 
    tt()
tinytable_ad95sn3f2m9h3xg92cv3
sample_type mean sd median IQR
Anal/cloacal swab 72.84533 22.25244 76.29903 30.61081
Faecal 86.58154 25.00932 100.00000 15.61039
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(tax_group) %>%
    summarise(mean=mean(damr, na.rm=T),
              sd=sd(damr, na.rm=T),
              median=median(damr, na.rm = TRUE),
              IQR=IQR(damr, na.rm = TRUE)) %>% 
    tt()
tinytable_ysulhkotxnia2zqa2ith
tax_group mean sd median IQR
Amphibians 89.19356 24.84575 100.00000 5.927094
Bats 80.94051 24.67483 91.76713 30.814236
Birds 67.03815 24.99000 72.18568 29.811183
Mammals 91.06092 16.77450 100.00000 11.191039
Reptiles 84.52325 31.14150 100.00000 10.748612

6.2.2 Statistical test

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,11,singlem_fraction)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,1,mapping_percentage/singlem_fraction)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(damr)) %>%
    glm(damr ~ sample_type + tax_group, data = .,family=quasibinomial)  %>%
    Anova(test.statistic = "F",type="III") %>%
    tidy()%>%
    tt()
tinytable_jk0jt97q9809f3j0hutw
term sumsq df F.values p.value
sample_type 13.49334 1 19.83706 8.794973e-06
tax_group 76.64409 4 28.16933 6.244313e-23
Residuals 1742.01471 2561 NA NA
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,11,singlem_fraction)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,1,mapping_percentage/singlem_fraction)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(damr)) %>%
    mutate(sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(damr ~ sample_type + tax_group, data = .,family=quasibinomial) %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_q1ycocu0z24chvb0sltx
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.5736540 1.1220474 -1.4024844 5.926804e-01
tax_group Birds - Amphibians 0 -2.7097758 1.0700747 -2.5323240 7.211060e-02
tax_group Mammals - Amphibians 0 0.3210380 1.1686087 0.2747181 9.985217e-01
tax_group Reptiles - Amphibians 0 -3.2127466 1.0255111 -3.1328247 1.257651e-02
tax_group Birds - Bats 0 -1.1361218 0.5793468 -1.9610391 2.562729e-01
tax_group Mammals - Bats 0 1.8946920 0.7458078 2.5404560 7.058680e-02
tax_group Reptiles - Bats 0 -1.6390926 0.4921951 -3.3301684 6.500154e-03
tax_group Mammals - Birds 0 3.0308138 0.6650554 4.5572350 4.222758e-05
tax_group Reptiles - Birds 0 -0.5029708 0.3582254 -1.4040625 5.916447e-01
tax_group Reptiles - Mammals 0 -3.5337846 0.5906877 -5.9824922 1.667799e-08

6.2.3 Plot

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(singlem_fraction>0) %>% 
    ggplot(aes(y=damr, x=sample_type, color=sample_type, fill=sample_type, group=sample_type)) +
        geom_boxplot(outlier.shape = NA) +
        scale_color_manual(values = c("#bdca50", "#AA3377")) +   
        scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
        theme_classic() +
        labs(y="DAMR", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/damr_type.pdf",width=5, height=4, units="in")
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(singlem_fraction>0) %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=damr, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="DAMR", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/damr_taxa.pdf",width=9, height=4, units="in")

6.3 Assemblies

6.3.1 Data summary

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(sample_type) %>%
    summarise(mean=mean(assembly_length, na.rm=T),
              sd=sd(assembly_length, na.rm=T),
              median=median(assembly_length, na.rm = TRUE),
              IQR=IQR(assembly_length, na.rm = TRUE)) %>% 
    tt()
tinytable_95apjn1fwqh76nxzseaw
sample_type mean sd median IQR
Anal/cloacal swab 13208042 34504175 0 0
Faecal 95955306 83010036 97047664 151385731
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(tax_group) %>%
    summarise(mean=mean(assembly_length, na.rm=T),
              sd=sd(assembly_length, na.rm=T),
              median=median(assembly_length, na.rm = TRUE),
              IQR=IQR(assembly_length, na.rm = TRUE)) %>% 
    tt()
tinytable_wd9hkty6c84tjx5ar0i8
tax_group mean sd median IQR
Amphibians 116614689 68762523 121907456 88263039
Bats 31069176 77144825 0 41984504
Birds 8688016 26409919 0 0
Mammals 96415766 69841084 98624268 75754888
Reptiles 139349280 74190452 143970512 93046444

6.3.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    lm(rank(assembly_length) ~ sample_type + tax_group, data = .)  %>%
    Anova(type="III") %>%
    tidy()%>%
    tt()
tinytable_1w85vvi9ec33abg905m8
term sumsq df statistic p.value
(Intercept) 33267819 1 314.2467 3.650912e-64
sample_type 13028568 1 123.0674 1.427942e-27
tax_group 100440135 4 237.1885 5.790079e-159
Residuals 163985333 1549 NA NA

6.3.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=assembly_length, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,400000000)+
        geom_jitter(alpha = 0.3, width=0.3, size=0.5) +
        geom_violin() +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type, scale="free") +
        labs(y="Assembly size", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/assemblysize_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_length, x=sample_type, group=sample_type)) +
          ylim(0,400000000)+
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Assembly size", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/assemblysize_taxa_all.pdf",width=9, height=4, units="in")

6.4 Number of MAGs

6.4.1 Summary statistics

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(sample_type) %>%
    summarise(mean=mean(assembly_num_bins, na.rm=T),
              sd=sd(assembly_num_bins, na.rm=T),
              median=median(assembly_num_bins, na.rm = TRUE),
              IQR=IQR(assembly_num_bins, na.rm = TRUE)) %>% 
    tt()
tinytable_q9qaojqkg65ckwn08407
sample_type mean sd median IQR
Anal/cloacal swab 2.644068 6.971768 0 0
Faecal 18.386792 16.099835 19 29
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(tax_group) %>%
    summarise(mean=mean(assembly_num_bins, na.rm=T),
              sd=sd(assembly_num_bins, na.rm=T),
              median=median(assembly_num_bins, na.rm = TRUE),
              IQR=IQR(assembly_num_bins, na.rm = TRUE)) %>% 
    tt()
tinytable_rdl9665mnnmmn5vqyjnu
tax_group mean sd median IQR
Amphibians 23.758389 14.944334 24 18
Bats 3.692308 7.451098 0 5
Birds 1.582979 5.600340 0 0
Mammals 22.062635 16.345908 23 19
Reptiles 23.924107 13.736366 24 18

6.4.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    glm(assembly_num_bins ~ sample_type + tax_group, data = .,family=quasipoisson)  %>%
    Anova(test.statistic = "F",type = "III") %>%
    tidy()%>%
    tt()
tinytable_s424hg5xif1tmyw9t6sp
term sumsq df F.values p.value
sample_type 1781.293 1 131.4547 2.875375e-29
tax_group 9870.795 4 182.1096 5.599140e-128
Residuals 20989.917 1549 NA NA
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(assembly_num_bins ~ sample_type + tax_group, data = .,family=quasipoisson)  %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_osmz374on0cqf3pxbmeh
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.89252007 0.13396130 -14.12736443 0.0000000
tax_group Birds - Amphibians 0 -2.37578946 0.20160147 -11.78458429 0.0000000
tax_group Mammals - Amphibians 0 -0.02494121 0.07183869 -0.34718348 0.9962961
tax_group Reptiles - Amphibians 0 -0.00222888 0.07136120 -0.03123378 0.9999997
tax_group Birds - Bats 0 -0.48326938 0.22581601 -2.14010238 0.1759741
tax_group Mammals - Bats 0 1.86757887 0.12433118 15.02100124 0.0000000
tax_group Reptiles - Bats 0 1.89029119 0.12401954 15.24188255 0.0000000
tax_group Mammals - Birds 0 2.35084825 0.19506107 12.05185782 0.0000000
tax_group Reptiles - Birds 0 2.37356058 0.19518653 12.16047329 0.0000000
tax_group Reptiles - Mammals 0 0.02271233 0.05098722 0.44545131 0.9903558

6.4.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=assembly_num_bins, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,100) +
        geom_jitter(alpha = 0.6, width=0.3, size=0.5) +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type, scale="free") +
        labs(y="Number of MAGs", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/numberofbins_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_num_bins, x=sample_type, group=sample_type)) +
        ylim(0,100) +
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Number of MAGs", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/numberofbins_taxa_all.pdf",width=9, height=4, units="in")

6.5 MAG quality

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=mag_completeness, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(50,100) +
        #geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5, width = 0.5, .width = 0,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Number of bins", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/completeness_taxa.pdf",width=5, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=mag_contamination, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,10) +
        #geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5, width = 0.5, .width = 0,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Number of bins", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/contamination_taxa.pdf",width=5, height=4, units="in")

6.6 Assemblies vs MAGs

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual")%>%
    with(cor(assembly_num_bins,assembly_length))
[1] 0.8511555
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_num_bins, x=assembly_length, color=tax_group)) +
        geom_point(alpha=0.5, size=0.5) +
        scale_color_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        geom_smooth(method="lm",se=FALSE)+
        theme_classic() +
        labs(y="Number of bins", x="Assembly length", color="Taxa", fill="Taxa")